import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread
%matplotlib inline
pd.set_option('display.max_columns', 100)
import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import cv2
import os
from os import path
# Tokenizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
# Stop words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Lemmatizer (base form of a word)
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
import gensim
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import transformers
from transformers import TFAutoModel, AutoTokenizer
import tensorflow_hub as hub
import tensorflow_text
embed = hub.load("universal-sentence-encoder-large_5/")  # local copy of the Universal Sentence Encoder module
#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
import tensorflow as tf
import time
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv("Flipkart/flipkart_com-ecommerce_sample_1050.csv", encoding="utf-8-sig")
print("Le jeu de données flipkart_com-ecommerce_sample contient %d lignes et %d colonnes." % (data.shape[0], data.shape[1]))
data.head()
data.info()
msno.matrix(data)
data.isna().sum()
data = data.dropna(subset=['retail_price', 'discounted_price', 'product_specifications'], how='all')
#data["product_category_tree"].tolist()
data['product_category_tree'].nunique()
data['product_category_tree'].str.split(">>", expand=True)
data['product_category_tree'].str.split(">>", n=2, expand=True)
data["product_category_1"] = data["product_category_tree"].apply(
lambda x: x.split('["')[1].split('"]')[0].split(">>")[0]
)
data["product_category_2"] = data["product_category_tree"].apply(
lambda x: x.split('["')[1].split('"]')[0].split(">>")[1]
)
data["product_category_3"] = data["product_category_tree"].apply(
lambda x: x.split('["')[1].split('"]')[0].split(">>")[2]
if len(x.split(">>")) > 2
else ""
)
data.head()
data["product_category_1"]
baby = data[data["product_category_1"] == "Baby Care "]  # note the trailing space left by the ">>" split
fig = go.Figure(go.Parcats(
dimensions=[
{'values': baby.product_category_1.values},
{'values': baby.product_category_2.values},
{'values': baby.product_category_3.values}]
))
fig.update_layout(
title="Product Category 'Baby Care' décomposée"
)
fig.show()
home = data[data["product_category_1"] == "Home Furnishing "]  # note the trailing space left by the ">>" split
fig = go.Figure(go.Parcats(
dimensions=[
{'values': home.product_category_1.values},
{'values': home.product_category_2.values},
{'values': home.product_category_3.values}]
))
fig.update_layout(
title="Product Category 'Home Furnishing' décomposée"
)
fig.show()
data = data.drop(['product_category_tree'], axis=1)
for col in ["product_category_1", "product_category_2", "product_category_3"]:
print(f"Nombre de catégories pour la colonne {col} = {data[col].nunique()}")
print(data[col].value_counts().sort_values(ascending=False))
print("-" * 80)
for elem in data['product_category_1'].unique():
print('---',elem,'---')
wordcloud = WordCloud(max_words=50).generate(' '.join(list(data[data['product_category_1']==elem]['description'])))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#data["product_specifications"].tolist()
# Function to reformat the content of the product_specifications field
def json_reformat(a):
a="{"+a[26:-1].replace('\"key\"=>',"").replace('\"value\"=>',"").replace("\", \"","\":\"").replace("[","").replace("]","").replace("{","").replace("}","")+"}"
return a
data["product_specifications"]=data["product_specifications"].apply(lambda x: json_reformat(x) if x is not np.nan else "")
data.head()
data["product_rating"].unique()
fig = px.bar(data["product_rating"].value_counts(),
title="Nombre d'avis par produits")
fig.show()
fig = px.bar(data["overall_rating"].value_counts(),
title="Nombre d'avis")
fig.show()
data = data.drop(['overall_rating'], axis=1)
fig = px.bar(data["brand"].value_counts(),
title="Marque des produits")
fig.show()
## Lowercase
data['description'] = data['description'].apply(lambda x: " ".join(x.lower() for x in x.split()))
## remove punctuation
data['description'] = data['description'].str.replace(r'[^\w\s]', '', regex=True)
This step takes a long string of text, converts each word into a "token", and places the tokens in a list. The list values are much easier for the later steps to manipulate.
def tokenize(column):
tokens = nltk.word_tokenize(column)
return [w for w in tokens if w.isalpha()]
data['tokenized'] = data.apply(lambda x: tokenize(x['description']), axis=1)
data.head()
This step reduces the noise in our data by removing stopwords: language-specific words that appear in sentences but add little meaning. Removing them helps the model focus on the words that matter.
nltk.download('stopwords')
nltk.download('wordnet')
#nltk.download('omw-1.4')
def remove_stopwords(tokenized_column):
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if word not in stops]
data['stopwords_removed'] = data.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
data.head()
Although the two techniques look similar, they produce different results, so it is important to choose the one that suits the analysis at hand.
Stemming, the simpler of the two, groups words by their root stem. It lets us recognise that 'jumping', 'jumps' and 'jumped' all derive from the same verb (jump) and therefore refer to similar concepts.
Lemmatization, on the other hand, groups words by the dictionary form of their root, which preserves grammatical distinctions such as present and past tense.
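To make the difference concrete, here is a minimal side-by-side comparison on a handful of sample words (the word list is purely illustrative):
# Quick stemming vs lemmatization comparison (sample words are illustrative)
sample_words = ["jumping", "jumps", "jumped", "studies", "better"]
print([PorterStemmer().stem(w) for w in sample_words])           # e.g. ['jump', 'jump', 'jump', 'studi', 'better']
print([WordNetLemmatizer().lemmatize(w) for w in sample_words])  # e.g. ['jumping', 'jump', 'jumped', 'study', 'better']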
def apply_stemming(tokenized_column):
stemmer = PorterStemmer()
return [stemmer.stem(word) for word in tokenized_column]
data['porter_stemmed'] = data.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
data.head()
def apply_lemmatize(tokenized_column):
lemmatizer = WordNetLemmatizer()
return [lemmatizer.lemmatize(word) for word in tokenized_column]
data['lemmatize'] = data.apply(lambda x: apply_lemmatize(x['stopwords_removed']), axis=1)
data.head()
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', -1)
data[['tokenized','stopwords_removed','porter_stemmed','lemmatize']][:1].transpose()
Now let's join the processed tokens back into strings.
# join the token lists back into strings (.apply avoids positional indexing, which breaks on a non-contiguous index)
data['lemmatize'] = data['lemmatize'].apply(' '.join)
data['porter_stemmed'] = data['porter_stemmed'].apply(' '.join)
Bag of Words is a representation that turns arbitrary text into fixed-length vectors by counting how many times each word appears.
CountVectorizer converts a collection of text documents into a matrix of term counts.
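As a quick sanity check of the idea (the two sentences below are made up, not from the dataset):
# Toy CountVectorizer example on two made-up documents
demo_cv = CountVectorizer()
demo_counts = demo_cv.fit_transform(["blue cotton shirt", "blue denim jeans blue"])
print(demo_cv.get_feature_names_out())  # ['blue' 'cotton' 'denim' 'jeans' 'shirt']
print(demo_counts.toarray())            # [[1 1 0 0 1] [2 0 1 1 0]]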
# Build the bag of words (CountVectorizer)
cvect = CountVectorizer(stop_words='english', max_df=0.95, min_df=1)
cv_transform_lem = cvect.fit_transform(data['lemmatize'])
features_names = cvect.get_feature_names_out()
word_fre_vect_lem = pd.DataFrame(cv_transform_lem.toarray(), columns=features_names)
word_fre_vect_lem.head(5)
word_fre_vect_lem.shape
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_vect_lem.sum().sort_values(
ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
item.set_rotation(90)
cv_transform_stem = cvect.fit_transform(data['porter_stemmed'])
features_names = cvect.get_feature_names_out()
word_fre_vect_stem = pd.DataFrame(cv_transform_stem.toarray(), columns=features_names)
word_fre_vect_stem.head(5)
word_fre_vect_stem.shape
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_vect_stem.sum().sort_values(
ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
item.set_rotation(90)
TF-IDF (Term Frequency–Inverse Document Frequency) is a measure of how relevant a term is to a document. It takes into account not only a word's occurrences within a single description but also its frequency across the whole corpus: common words are penalised with lower weights, while words that are rare in the corpus overall but frequent in a few documents gain importance.
Key idea behind TF-IDF:
The score combines the frequency of a term in a given document (TF) with the number of documents that contain the term (IDF).
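As a rough hand-computed sketch of the score (toy corpus; scikit-learn's TfidfVectorizer additionally smooths the IDF and L2-normalises each row):
# Plain tf-idf of the word 'red' in document 0 of a toy 3-document corpus
toy_docs = ["red shirt", "red shoes", "leather wallet"]
term_freq = 1 / 2             # 'red' occurs once among the 2 tokens of doc 0
inv_doc_freq = np.log(3 / 2)  # 3 documents in total, 2 of them contain 'red'
print("tf-idf of 'red' in doc 0 ≈", term_freq * inv_doc_freq)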
# Build the bag of words (TF-IDF)
ctf = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=1)
ctf_transform_lem = ctf.fit_transform(data['lemmatize'])
features_names = ctf.get_feature_names_out()
word_fre_tfidf_lem = pd.DataFrame(ctf_transform_lem.toarray(), columns=features_names)
word_fre_tfidf_lem.head(5)
word_fre_tfidf_lem.shape
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_tfidf_lem.sum().sort_values(
ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
item.set_rotation(90)
ctf_transform_stem = ctf.fit_transform(data['porter_stemmed'])
features_names = ctf.get_feature_names_out()
word_fre_tfidf_stem = pd.DataFrame(ctf_transform_stem.toarray(), columns=features_names)
word_fre_tfidf_stem.head(5)
word_fre_tfidf_stem.shape
plt.figure(figsize=(10, 5))
occ = sns.barplot(data=word_fre_tfidf_stem.sum().sort_values(
ascending=False).to_frame().head(30).T)
for item in occ.get_xticklabels():
item.set_rotation(90)
LDA (Latent Dirichlet Allocation) is an unsupervised generative model that represents each document as a mixture of topics and each topic as a distribution over words. It is used here for topic modelling, i.e. assigning a document's text to particular topics. (Not to be confused with Linear Discriminant Analysis, which maximises between-class relative to within-class variance.)
# Creation of LDA model
lda_model = LatentDirichletAllocation(
n_components=10,
max_iter=10,
learning_method='online',
learning_offset=10.,
random_state=42)
def lda(vector):
    # the shared lda_model is refit on each call, so its components_ always reflect the latest fit
    return lda_model.fit_transform(vector)
def display_topics(lda_model, feature_names, no_top_words):
for topic_idx, topic in enumerate(lda_model.components_):
print("Topic {}:".format(topic_idx))
print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
lda_tf_lem = lda(word_fre_vect_lem)
lda_tf_idf_lem = lda(word_fre_tfidf_lem)
lda_tf_stem = lda(word_fre_vect_stem)
lda_tf_idf_stem = lda(word_fre_tfidf_stem)
Top 3 words of each of the 10 topics obtained by LatentDirichletAllocation (note that the shared lda_model is refit on each call above, so these topics reflect the most recent fit).
display_topics(lda_model, cvect.get_feature_names_out(), 3)
# PCA Pipeline
pca = PCA(svd_solver='full')
data_pca = pca.fit_transform(word_fre_vect_lem)
# Explained variance
varexpl = pca.explained_variance_ratio_*100
# Plot of cumulative variance
plt.figure(figsize=(12,8))
plt.bar(np.arange(len(varexpl))+1, varexpl)
cumSumVar = varexpl.cumsum()
plt.plot(np.arange(len(varexpl))+1, cumSumVar,c="red",marker='o')
valid_idx = np.where(cumSumVar >= 95)[0]
min_plans = valid_idx[0] + 1  # first component count reaching 95% (the cumulative sum is non-decreasing)
plt.xlabel("inertia axis rank")
plt.xticks(np.arange(len(varexpl))+1)
plt.ylabel("percentage of inertia")
plt.title("{}% of the total variance is explained"
          " by the first {} components".format(95, min_plans))
plt.show(block=False)
def pca_reduce(vector):  # keep the components explaining 95% of the variance
    pca_95 = PCA(n_components=0.95)
    return pca_95.fit_transform(vector)
pca_tf_lem = pca_reduce(word_fre_vect_lem)
pca_tf_idf_lem = pca_reduce(word_fre_tfidf_lem)
pca_tf_stem = pca_reduce(word_fre_vect_stem)
pca_tf_idf_stem = pca_reduce(word_fre_tfidf_stem)
t-SNE is an unsupervised, non-linear dimensionality-reduction technique. It embeds points from a higher-dimensional space into a lower-dimensional one while trying to preserve each point's neighbourhood.
tsne = TSNE(n_components=2, verbose=1, perplexity=80,n_iter=5000, learning_rate=200, random_state=42)
def tsne_reduce(dimension):  # a function named TSNE would shadow the sklearn class
    res_tsne = tsne.fit_transform(dimension)
    return pd.DataFrame(res_tsne, columns=['tsne1', 'tsne2'])
tsne_pca_tf_lem = tsne_reduce(pca_tf_lem)
tsne_pca_tf_idf_lem = tsne_reduce(pca_tf_idf_lem)
tsne_lda_tf_lem = tsne_reduce(lda_tf_lem)
tsne_lda_tf_idf_lem = tsne_reduce(lda_tf_idf_lem)
tsne_pca_tf_stem = tsne_reduce(pca_tf_stem)
tsne_pca_tf_idf_stem = tsne_reduce(pca_tf_idf_stem)
tsne_lda_tf_stem = tsne_reduce(lda_tf_stem)
tsne_lda_tf_idf_stem = tsne_reduce(lda_tf_idf_stem)
list_ari = []
def plot_kmeans_tsne(reduction, title, filename, colname):
    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200, init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_
    cl_tsne = pd.concat([reduction, pd.DataFrame({'tsne_clusters': labels_tsne})], axis=1)
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']
    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)
    fig = px.scatter(data, x=cl_tsne.iloc[:, 0], y=cl_tsne.iloc[:, 1], color=categories_true,
                     title=f"Projection coloured by true classes {title}")
    fig1 = px.scatter(data, x=cl_tsne.iloc[:, 0], y=cl_tsne.iloc[:, 1], color=categories_predict,
                      title=f"Projection coloured by predicted clusters {title}")
    plotly.offline.plot(fig, filename=f'plots/{filename}.html')
    plotly.offline.plot(fig1, filename=f'plots/{filename}_cluster.html')
    fig.show()
    fig1.show()
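As a reminder of how to read the Adjusted Rand Index used here: it compares two partitions while ignoring label permutations, with 1 meaning identical groupings and values near 0 meaning chance-level agreement. A quick sanity check:
# ARI is invariant to label permutation
print(metrics.adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0: same partition, labels swapped
print(metrics.adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))  # -0.5: the partitions systematically disagree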
#visualizing CountVectorizer bag of words with PCA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_pca_tf_lem,
"Cluster Kmeans based on lemmatized CountVectorizer with PCA (TSNE)",
"Kmeans_CountVect_lem_PCA_tsne","cvec_lem_pca_tsne")
The charts show the true product classes next to what the KMeans model predicts. The classification is visibly poor.
# Analyze the category composition of each cluster
def plot_cluster_composition(colname):
    index_tot = [data[data[colname] == x].index
                 for x in data[colname].value_counts().index]
    plt.figure(figsize=(20, 20))
    n_cols = int(np.ceil(len(index_tot) / 3))  # subplot() requires integer grid dimensions
    for x in range(len(index_tot)):
        order_hue = data.loc[index_tot[x], 'product_category_1'].value_counts().index
        plt.subplot(4, n_cols, x + 1)
        sns.countplot(y=data.loc[index_tot[x], 'product_category_1'],
                      order=order_hue,
                      palette='Blues_r')
        plt.title(f"Cluster {x}", fontsize=20)

plot_cluster_composition('cluster cvec_lem_pca_tsne')
Looking at how the clusters are composed, watches are recognised very well by the model, but for several other products it fails to identify the right category.
#visualizing TF-IDF bag of words with PCA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_pca_tf_idf_lem,
"Cluster Kmeans based on lemmatized TF-IDF with PCA (TSNE)",
"Kmeans_TFIDF_lem_PCA", "TFIDF_lem_PCA")
Here too, the products are poorly classified.
# Analyze the category composition of each cluster
plot_cluster_composition('cluster TFIDF_lem_PCA')
#visualizing CountVectorizer bag of words with LDA reduction by using 2D TSNE
plot_kmeans_tsne(tsne_lda_tf_lem,
"Cluster Kmeans based on lemmatized CountVectorizer with LDA (TSNE)",
"Kmeans_CountVec_lem_LDA","CVec_lem_LDA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster CVec_lem_LDA')
list_ari2 = []
# Redefinition: from here on, ARI scores are logged into list_ari2 and a static
# seaborn scatter is drawn instead of the interactive plotly figures.
def plot_kmeans_tsne(reduction, title, filename, colname):
    kmeans_tsne = KMeans(n_clusters=7, n_init=50, max_iter=200, init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_
    cl_tsne = pd.concat([reduction, pd.DataFrame({'tsne_clusters': labels_tsne})], axis=1)
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']
    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari2.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)
    sns.scatterplot(data=data, x=cl_tsne.iloc[:, 0], y=cl_tsne.iloc[:, 1], hue=categories_true)
    plt.title(f"Projection coloured by true classes {title}")
    plt.show()
#visualizing TF-IDF bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))
plot_kmeans_tsne(tsne_lda_tf_idf_lem,
"Cluster Kmeans based on lemmatized TF-IDF with LDA (TSNE)",
"Kmeans_TFIDF_lem_LDA", "TFIDF_lem_LDA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster TFIDF_lem_LDA')
#visualizing CountVectorizer bag of words with PCA reduction by using 2D TSNE
plt.figure(figsize=(15,8))
plot_kmeans_tsne(tsne_pca_tf_stem,
"Cluster Kmeans based on stemmed CountVectorizer with PCA (TSNE)",
"Kmeans_CountVec_stem_PCA", "CountVec_stem_PCA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster CountVec_stem_PCA')
#visualizing TF-IDF bag of words with PCA reduction by using 2D TSNE
plt.figure(figsize=(15,8))
plot_kmeans_tsne(tsne_pca_tf_idf_stem,
"Cluster Kmeans based on stemmed TF-IDF with PCA (TSNE)",
"Kmeans_TFIDF_stem_PCA", "TFIDF_stem_PCA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster TFIDF_stem_PCA')
#visualizing CountVectorizer bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))
plot_kmeans_tsne(tsne_lda_tf_stem,
"Cluster Kmeans based on stemmed CountVectorizer with LDA (TSNE)",
"Kmeans_countVec_stem_LDA", "countVec_stem_LDA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster countVec_stem_LDA')
#visualizing TF-IDF bag of words with LDA reduction by using 2D TSNE
plt.figure(figsize=(15,8))
plot_kmeans_tsne(tsne_lda_tf_idf_stem,
                 "Cluster Kmeans based on stemmed TF-IDF with LDA (TSNE)",
                 "Kmeans_TFIDF_stem_LDA", "TFIDF_stem_LDA")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster TFIDF_stem_LDA')
Word embeddings are another very popular way to approach an NLP problem: words are converted into vectors and fed into various ML and deep-learning models.
Word2Vec is an unsupervised algorithm that uses a 3-layer neural network (one input layer, one hidden layer, one output layer). Word2Vec produces context-independent embeddings: there is a single vector representation for each word.
w2v_size=300
w2v_window=5
w2v_min_count=1
w2v_epochs=100
maxlen = 2524 # adapt to length of sentences
sentences = data['lemmatize'].to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]
# Build and train the Word2Vec model
print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window, vector_size=w2v_size, seed=42, workers=1, epochs=w2v_epochs)
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")
#print('words:',w2v_words)
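Once trained, the embedding space can be probed with nearest-neighbour queries; the query word below is only an assumption about what the vocabulary contains, so it is guarded:
# Probe the trained space (assumes 'cotton' made it into the vocabulary; any word from w2v_words works)
if 'cotton' in model_vectors:
    print(model_vectors.most_similar('cotton', topn=5))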
# Prepare the sentences (tokenization)
print("Fit Tokenizer ...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
maxlen=maxlen,
padding='post')
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)
plt.figure(num=None, figsize=(15, 10), facecolor='w', edgecolor='k')
# fit a 2d PCA model to the word vectors
X = model_vectors[model_vectors.index_to_key]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# scatter plot of the projection; annotating every word would be unreadable,
# so only the most frequent words are labelled (index_to_key is frequency-sorted)
plt.scatter(result[:, 0], result[:, 1])
for i, word in enumerate(model_vectors.index_to_key[:200]):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()
# Build the embedding matrix
print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))
i = 0
j = 0
for word, idx in word_index.items():
    i += 1
    if word in model_vectors:  # KeyedVectors membership test is O(1), unlike scanning the w2v_words list
        j += 1
        embedding_matrix[idx] = model_vectors[word]
word_rate = np.round(j / i, 4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))
# Build the embedding model (average-pooled Word2Vec vectors)
word_input = Input(shape=(maxlen,), dtype='float64')
word_embedding = Embedding(input_dim=vocab_size,
                           output_dim=w2v_size,
                           weights=[embedding_matrix],
                           input_length=maxlen)(word_input)
word_vec = GlobalAveragePooling1D()(word_embedding)
embed_model = Model([word_input], word_vec)
embed_model.summary()
embeddings = embed_model.predict(x_sentences)
embeddings.shape
X_tsne_w2v = tsne.fit_transform(embeddings)
df_tsne_w2v = pd.DataFrame(X_tsne_w2v[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_w2v.shape)
plt.figure(figsize=(15,8))
plot_kmeans_tsne(df_tsne_w2v,
"Cluster Kmeans based on lemmatized word2vec",
"Kmeans_word2vec_lemmatize", "word2vec_lem")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster word2vec_lem')
sentences_stem = data['porter_stemmed'].to_list()
sentences_stem = [gensim.utils.simple_preprocess(text) for text in sentences_stem]
# Build and train the Word2Vec model
print("Build & train Word2Vec model ...")
w2v_model = Word2Vec(min_count=w2v_min_count, window=w2v_window, vector_size=w2v_size, seed=42, workers=1, epochs=w2v_epochs)
w2v_model.build_vocab(sentences_stem)
w2v_model.train(sentences_stem, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
model_vectors_stem = w2v_model.wv
w2v_words_stem = model_vectors_stem.index_to_key
print("Vocabulary size: %i" % len(w2v_words_stem))
print("Word2Vec trained")
#print('words:',w2v_words)
# Prepare the sentences (tokenization)
print("Fit Tokenizer ...")
tokenizer_stem = Tokenizer()
tokenizer_stem.fit_on_texts(sentences_stem)
x_sentences_stem = pad_sequences(tokenizer_stem.texts_to_sequences(sentences_stem),
maxlen=maxlen,
padding='post')
num_words_stem = len(tokenizer_stem.word_index) + 1
print("Number of unique words: %i" % num_words_stem)
plt.figure(num=None, figsize=(15, 10), facecolor='w', edgecolor='k')
# fit a 2d PCA model to the word vectors
X = model_vectors_stem[model_vectors_stem.index_to_key]
pca_stem = PCA(n_components=2)
result_stem = pca_stem.fit_transform(X)
# scatter plot of the projection; only the most frequent words are labelled for readability
plt.scatter(result_stem[:, 0], result_stem[:, 1])
for i, word in enumerate(model_vectors_stem.index_to_key[:200]):
    plt.annotate(word, xy=(result_stem[i, 0], result_stem[i, 1]))
plt.show()
# Build the embedding matrix (stemmed)
print("Create Embedding matrix ...")
w2v_size = 300
word_index_stem = tokenizer_stem.word_index
vocab_size_stem = len(word_index_stem) + 1
embedding_matrix_stem = np.zeros((vocab_size_stem, w2v_size))
i = 0
j = 0
for word, idx in word_index_stem.items():
    i += 1
    if word in model_vectors_stem:
        j += 1
        embedding_matrix_stem[idx] = model_vectors_stem[word]
word_rate = np.round(j / i, 4)
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix_stem.shape))
# Build the embedding model (stemmed)
word_input_stem = Input(shape=(maxlen,), dtype='float64')
word_embedding_stem = Embedding(input_dim=vocab_size_stem,
                                output_dim=w2v_size,
                                weights=[embedding_matrix_stem],
                                input_length=maxlen)(word_input_stem)
word_vec = GlobalAveragePooling1D()(word_embedding_stem)
embed_model_stem = Model([word_input_stem], word_vec)
embed_model_stem.summary()
embeddings_stem = embed_model_stem.predict(x_sentences_stem)
embeddings_stem.shape
X_tsne_w2v_stem = tsne.fit_transform(embeddings_stem)
df_tsne_w2v_stem = pd.DataFrame(X_tsne_w2v_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_w2v_stem.shape)
plot_kmeans_tsne(df_tsne_w2v_stem,
"Cluster Kmeans based on stemmed word2vec",
"Kmeans_word2vec_stemmed", "word2vec_stemmed")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster word2vec_stemmed')
The BERT model produces contextual embeddings: the same word can have several vector representations, depending on the context in which it is used.
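A small sketch of what "contextual" means, using the HuggingFace classes imported above (downloading 'bert-base-uncased' and the two toy sentences are assumptions of this illustration):
# Same word, two contexts -> two different BERT vectors (illustrative sketch)
demo_tok = AutoTokenizer.from_pretrained('bert-base-uncased')
demo_bert = TFAutoModel.from_pretrained('bert-base-uncased')
enc = demo_tok(["river bank", "bank account"], return_tensors="tf", padding=True)
out = demo_bert(**enc).last_hidden_state                # shape (2, seq_len, 768)
v1 = out[0, 2].numpy()  # 'bank' in "river bank"    ([CLS] river bank [SEP])
v2 = out[1, 1].numpy()  # 'bank' in "bank account"  ([CLS] bank account [SEP])
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))  # noticeably below 1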
# Sentence-preparation function
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
input_ids=[]
token_type_ids = []
attention_mask=[]
bert_inp_tot = []
for sent in sentences:
bert_inp = bert_tokenizer.encode_plus(sent,
add_special_tokens = True,
max_length = max_length,
padding='max_length',
return_attention_mask = True,
return_token_type_ids=True,
truncation=True,
return_tensors="tf")
input_ids.append(bert_inp['input_ids'][0])
token_type_ids.append(bert_inp['token_type_ids'][0])
attention_mask.append(bert_inp['attention_mask'][0])
bert_inp_tot.append((bert_inp['input_ids'][0],
bert_inp['token_type_ids'][0],
bert_inp['attention_mask'][0]))
input_ids = np.asarray(input_ids)
token_type_ids = np.asarray(token_type_ids)
attention_mask = np.array(attention_mask)
return input_ids, token_type_ids, attention_mask, bert_inp_tot
# Feature-extraction function
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF'):
    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()
    # NB: iterating len(sentences)//batch_size steps drops a final partial batch
    for step in range(len(sentences)//batch_size):
        idx = step*batch_size
        input_ids, token_type_ids, attention_mask, bert_inp_tot = bert_inp_fct(sentences[idx:idx+batch_size],
                                                                               bert_tokenizer, max_length)
        if mode == 'HF':  # BERT from HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
            last_hidden_states = outputs.last_hidden_state
        if mode == 'TFhub':  # BERT from Tensorflow Hub
            text_preprocessed = {"input_word_ids": input_ids,
                                 "input_mask": attention_mask,
                                 "input_type_ids": token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']
        if step == 0:
            last_hidden_states_tot = last_hidden_states
        else:
            last_hidden_states_tot = np.concatenate((last_hidden_states_tot, last_hidden_states))
    # mean-pool the token embeddings of each sentence
    features_bert = np.array(last_hidden_states_tot).mean(axis=1)
    time2 = np.round(time.time() - time1, 0)
    print("processing time:", time2)
    return features_bert, last_hidden_states_tot
# Tensorflow Hub guide: https://www.tensorflow.org/text/tutorials/classify_text_with_bert
#bert_model = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')
bert_model = hub.load("bert_en_uncased_L-12_H-768_A-12_4/")  # local copy of the TF Hub BERT module
bert_layer = hub.KerasLayer(bert_model, trainable=True)
sentences = data['lemmatize'].to_list()
max_length = 64
batch_size = 10
model_type = 'bert-base-uncased'
model = bert_layer
features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences,
max_length, batch_size, mode='TFhub')
X_tsne_bert = tsne.fit_transform(features_bert)
df_tsne_bert = pd.DataFrame(X_tsne_bert[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_bert.shape)
plt.figure(figsize=(15,8))
plot_kmeans_tsne(df_tsne_bert,
"Cluster Kmeans based on lemmatized BERT",
"Kmeans_BERT_lem", "BERT_lem")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster BERT_lem')
sentences_stem = data['porter_stemmed'].to_list()
features_bert_stem, last_hidden_states_tot_stem = feature_BERT_fct(model, model_type, sentences_stem,
max_length, batch_size, mode='TFhub')
X_tsne_bert_stem = tsne.fit_transform(features_bert_stem)
df_tsne_bert_stem = pd.DataFrame(X_tsne_bert_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_bert_stem.shape)
plt.figure(figsize=(15,8))
plot_kmeans_tsne(df_tsne_bert_stem,
"Cluster Kmeans based on stemmed BERT",
"Kmeans_BERT_stem", "BERT_stem")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster BERT_stem')
USE (Universal Sentence Encoder) computes a vector representation of a text that preserves semantic proximity (similarity) between texts. The model captures the importance of words in context, based on both their position and their identity.
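Since `embed` is already loaded above, the semantic-proximity claim can be checked on a couple of made-up sentences:
# Semantically close sentences should get closer USE vectors than unrelated ones (illustrative)
demo_vecs = embed(["blue cotton shirt", "navy cotton t-shirt", "stainless steel saucepan"]).numpy()
cos = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos(demo_vecs[0], demo_vecs[1]))  # expected: relatively high
print(cos(demo_vecs[0], demo_vecs[2]))  # expected: lower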
def feature_USE_fct(sentences, b_size):
    batch_size = b_size
    # NB: a final partial batch is dropped when len(sentences) is not a multiple of batch_size
    for step in range(len(sentences)//batch_size):
        idx = step*batch_size
        feat = embed(sentences[idx:idx+batch_size])
        if step == 0:
            features = feat
        else:
            features = np.concatenate((features, feat))
    return features
batch_size = 10
sentences = data["lemmatize"].to_list()
tf_tensor = embed(sentences)
df_use = pd.DataFrame(tf.make_ndarray(tf.make_tensor_proto(tf_tensor)),
                      index=data.index,
                      columns=['dim' + str(i) for i in range(512)])
tsne_results_use = tsne.fit_transform(df_use)
df_tsne_use = pd.DataFrame(tsne_results_use[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_use.shape)
plt.figure(figsize=(15,8))
plot_kmeans_tsne(df_tsne_use,
"Cluster Kmeans based on lemmatized USE",
"Kmeans_USE_lem", "USE_lem")
This chart shows that the model manages to separate several categories, with an ARI score of 0.428.
# Analyze the category composition of each cluster
plot_cluster_composition('cluster USE_lem')
sentences_stem = data["porter_stemmed"].to_list()
features_USE_stem = feature_USE_fct(sentences_stem, batch_size)
features_USE_stem
X_tsne_use_stem = tsne.fit_transform(features_USE_stem)
df_tsne_use_stem = pd.DataFrame(X_tsne_use_stem[:,0:2], columns=['tsne1', 'tsne2'])
print(df_tsne_use_stem.shape)
plt.figure(figsize=(15,8))
plot_kmeans_tsne(df_tsne_use_stem,
"Cluster Kmeans based on stemmed USE",
"Kmeans_USE_stem", "USE_stem")
# Analyze the category composition of each cluster
plot_cluster_composition('cluster USE_stem')
list_aris = [*list_ari, *list_ari2]
df_ari=pd.DataFrame([list_aris]
,columns=['km_pca_tf_lem','km_pca_tf_idf_lem','km_lda_tf_lem','km_lda_tf_idf_lem',
'km_pca_tf_stem','km_pca_tf_idf_stem','km_lda_tf_stem','km_lda_tf_idf_stem',
'km_word2vec_lem','km_word2vec_stem','km_bert_lem','km_bert_stem','km_use_lem','km_use_stem'],
index=['ARI_SCORE'])
df_ari.T.round(2).plot(kind="bar",figsize=(10,6))
plt.xlabel("Model")
plt.ylabel("ARI Score")
The KMeans model on lemmatized USE features gives the best result, with an ARI of 0.432.
df_use.to_csv("Flipkart/df_use.csv")
data.to_csv("Flipkart/data_cleaned.csv")
df_ari.to_csv("Flipkart/ari.csv")